#----------------------------------------------
# Measuring Misperceptions: Limits of Party-Specific Stereotype Reports
# Orr and Huber, 2021, POQ
# Inputs: OrrHuber_POQ_Study2_raw.csv
# Outputs: OrrHuber_POQ_Study2_clean.csv
#----------------------------------------------



#----------------------------------------------
# Load Packages
#----------------------------------------------

library(survey)
library(dplyr)


#----------------------------------------------
# Load Data
#----------------------------------------------

# Read the raw Study 2 export; text columns stay character (no factors)
dat <- read.csv(file = "OrrHuber_POQ_Study2_raw.csv",
                stringsAsFactors = FALSE)


#----------------------------------------------
# Reweighting
#----------------------------------------------

# Modeled off of code written by Seth Hill - thank you!

# Set seed

# Fix the RNG state so the random draws further down are reproducible
set.seed(2021)

# Raking targets: population shares (U.S. Census Bureau) converted to counts
# on the scale of the sample, one data frame per margin. Each data frame has
# the margin variable plus a `Freq` column.
pop_targs <- local({
  n_resp <- nrow(dat)

  # Age band crossed with education tier, age varying fastest:
  #   1 = High school or less
  #   2 = Some college, associate, or post HS training
  #   3 = Bachelors
  #   4 = Graduate degree
  age_edu_cells <- paste(rep(c("18-29", "30-49", "50-69", "70+"), times = 4),
                         rep(1:4, each = 4),
                         sep = " & ")

  female_share <- c(0.492, 0.508)

  race_share <- c(0.637474968, 0.122061191, 0.046851281,
                  0.163492546, 0.030120014)

  age_edu_share <- c(
    0.083309387, 0.11475894, 0.12563857, 0.065449393,
    0.075067848, 0.083477011, 0.084921775, 0.034211367,
    0.04070083, 0.083313378, 0.06539751, 0.023359674,
    0.009905811, 0.050921935, 0.041183748, 0.018382822
  )

  income_share <- c(
    0.10211462, 0.044889992, 0.044112272, 0.0425335, 0.045278851, 0.042027982,
    0.040402548, 0.037633865, 0.041219154, 0.034351887, 0.035806223, 0.031101018,
    0.029514469, 0.030719935, 0.026045839, 0.023829337, 0.024124871, 0.02007295,
    0.090277724, 0.059215592, 0.042821257, 0.026769118, 0.035557353, 0.049579642
  )

  list(
    data.frame(female = c(0, 1),
               Freq = n_resp * female_share),
    data.frame(race = c("White", "Black", "Asian", "Hispanic", "Other"),
               Freq = n_resp * race_share),
    data.frame(ageXeduc = age_edu_cells,
               Freq = n_resp * age_edu_share),
    data.frame(hhincome = 1:24,
               Freq = n_resp * income_share)
  )
})



# Recode demographic data to match target categories

# Female indicator: 1 when gender is coded 2, 0 otherwise, NA when missing
dat$female <- (dat$gender == 2) * 1

# Collapse the ethnicity codes into the raking categories: 1 is White, 2 is
# Black, 4 through 10 are Asian, and every other observed code is "Other"
dat$race <- dplyr::recode(
  dat$ethnicity,
  !!!c("1" = "White", "2" = "Black", setNames(rep("Asian", 7), 4:10)),
  .default = "Other"
)

# A hispanic code of 2 through 14 takes precedence over the ethnicity answer
dat$race <- replace(dat$race,
                    as.numeric(dat$hispanic) %in% 2:14,
                    "Hispanic")

dat$race <- factor(dat$race,
                   levels = c("White", "Hispanic", "Black", "Asian", "Other"))


# Numeric copies of age and education; entries that cannot be parsed become NA
dat$age_recode <- as.numeric(dat$age)
dat$edu_recode <- as.numeric(dat$education)

# Hot-deck imputation: every unusable value is replaced by a random draw (with
# replacement) from the usable values of the same variable.
#
# Age is unusable when it is missing or equal to 2. The donor pool is the exact
# complement of that mask. The earlier donor condition,
# `!is.na(x) | x == 2`, kept the invalid value 2 in the pool and produced NA
# donors for missing rows, so imputed ages could themselves be NA or 2.
# NOTE: correcting the pool changes the imputed ages, and therefore the raked
# weights, relative to output produced with the old condition.
age_unusable <- is.na(dat$age_recode) | dat$age_recode == 2
dat$age_recode[age_unusable] <-
  sample(dat$age_recode[!age_unusable],
         sum(age_unusable),
         replace = TRUE)

# Education is unusable when it is missing or carries the code -3105
edu_unusable <- is.na(dat$edu_recode) | dat$edu_recode == -3105
dat$edu_recode[edu_unusable] <-
  sample(dat$edu_recode[!edu_unusable],
         sum(edu_unusable),
         replace = TRUE)

# Age-by-education raking cell, labelled "<age band> & <education tier>".
# Ages 18-29, 30-49, 50-69 and 70-95 form the four bands; education codes
# 1-2, 3-5, 6 and 7-8 form tiers 1 to 4. A row whose age or education falls
# outside those sets gets NA.
dat$ageXeduc <- local({
  # One label per integer age from 18 to 95 (12 + 20 + 20 + 26 values)
  age_band <- rep(c("18-29", "30-49", "50-69", "70+"),
                  times = c(12, 20, 20, 26))[match(dat$age_recode, 18:95)]

  # One tier per education code from 1 to 8
  edu_tier <- c("1", "1", "2", "2", "2", "3", "4", "4")[match(dat$edu_recode, 1:8)]

  cell <- paste(age_band, edu_tier, sep = " & ")
  cell[is.na(age_band) | is.na(edu_tier)] <- NA
  cell
})

# Household income category as a number. Values that are missing or carry the
# code -3105 are replaced by random draws (with replacement) from the
# remaining observed values.
dat$hhincome <- local({
  income <- as.numeric(dat$hhi)
  unusable <- is.na(income) | income == -3105
  income[unusable] <- sample(income[!unusable],
                             sum(unusable),
                             replace = TRUE)
  income
})

# Create survey design object for Lucid data

# Unclustered design (ids = ~1) with no starting weights supplied
datsvy <- svydesign(ids = ~1,
                    data = dat,
                    weights = NULL)

# Rake to each population margin. The formulas in `sample.margins` are listed
# in the same order as the data frames in `pop_targs`.

datsvy_rk <- rake(datsvy,
                  sample.margins = list(~female, ~race, ~ageXeduc, ~hhincome),
                  population.margins = pop_targs)

# Trim extreme weights to the interval [1/8, 8]. `strict` is spelled out as
# TRUE rather than `T`, because `T` is an ordinary variable that can be
# reassigned.

datsvy_rk_trimed <- trimWeights(datsvy_rk, lower = 1/8, upper = 8, strict = TRUE)

# Store the trimmed raking weight on each respondent row
dat$weight_uspop <- weights(datsvy_rk_trimed)


#----------------------------------------------
# Clean survey data
#----------------------------------------------

# Recode party id

# Three-category party id: "D", "R" or "I". The answer to the lean question,
# when one of the three listed responses was given, takes precedence over the
# stated party. Rows matching neither question stay NA.
dat$pid3 <- local({
  stated <- c("D", "R")[match(dat$pid, c("Democrat", "Republican"))]
  leaned <- c("D", "R", "I")[match(dat$pid_lean,
                                   c("Closer to Democratic Party",
                                     "Closer to Republican Party",
                                     "Neither"))]
  ifelse(is.na(leaned), stated, leaned)
})

# Recode treatment indicators

# Three-arm treatment factor: both "public" variants are pooled into a single
# level, and rows with an empty vignette assignment are set to NA
dat$treat3 <- local({
  arm <- paste(dat$treat_vignette, dat$treat_vignette_copid, sep = "_")
  arm[arm %in% c("public_copid", "public_outpid")] <- "public"
  arm[dat$treat_vignette == ""] <- NA
  factor(arm, levels = c("public", "partisan_copid", "partisan_outpid"))
})

# Party named in the vignette text; "Republican" wins if both names appear
dat$treat_pid <- "None"
dat$treat_pid[grepl("Democrat", dat$vignette)] <- "D"
dat$treat_pid[grepl("Republican", dat$vignette)] <- "R"

# Inverse weights per arm, looked up by factor level in the order
# public, partisan_copid, partisan_outpid
dat$treat_ipw <- c(1/0.333333, 1/0.233333, 1/0.433333)[as.integer(dat$treat3)]

# Rows with pid3 == "I" all receive the 1/0.333333 weight
dat$treat_ipw[dat$pid3 == "I"] <- 1/0.333333

# Rescale so the non-missing weights sum to the number of rows in dat
dat$treat_ipw_norm <- dat$treat_ipw / sum(dat$treat_ipw, na.rm = TRUE) * nrow(dat)

# Recode perceptions

# Convert every column whose name contains "how" to numeric, one column at a
# time. lapply() works on the columns directly, so nothing is routed through a
# matrix (as apply() on a data frame does) and a single matching column is
# handled the same way as several.
how_cols <- grep("how", names(dat))
dat[how_cols] <- lapply(dat[how_cols], as.numeric)

# Harmonised perception items. Each is copied from the raw "_1" item;
# nonwhite and nonrel are the complement (100 - x) of the white and rel items.
dat$how_female <- dat$how_male_1 # Note: variable initially recorded with the wrong name in Qualtrics
dat$how_city <- dat$how_city_1
dat$how_young <- dat$how_young_1
dat$how_lgb <- dat$how_lgb_1
dat$how_college <- dat$how_college_1
dat$how_nonwhite <- 100 - dat$how_white_1
dat$how_nonrel <- 100 - dat$how_rel_1
dat$how_under200 <- dat$how_under200_1
dat$how_union <- dat$how_union_1

# Where the primary item is missing, fall back to 100 minus the paired
# opposite item.
# NOTE(review): presumably each respondent saw only one of the two wordings;
# confirm against the questionnaire.
dat$how_female[is.na(dat$how_female)] <- (100 - dat$how_female_1)[is.na(dat$how_female)]
dat$how_city[is.na(dat$how_city)] <- (100 - dat$how_notcity_1)[is.na(dat$how_city)]
dat$how_young[is.na(dat$how_young)] <- (100 - dat$how_old_1)[is.na(dat$how_young)]
dat$how_under200[is.na(dat$how_under200)] <- (100 - dat$how_over200_1)[is.na(dat$how_under200)]

# Save cleaned data

# Write the cleaned, weighted data without a row-name column
write.csv(x = dat,
          file = "OrrHuber_POQ_Study2_clean.csv",
          row.names = FALSE)
